import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
import gmaps
import seaborn as sns
# Pandas display settings so wide/long frames render fully in the notebook.
# FIX: the original `low_memory=False` line only bound a throwaway module-level
# variable — it was never passed to pd.read_csv, so it had no effect.  The
# flag belongs in the pd.read_csv(...) call itself; the dead binding is removed.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50110)   # TODO confirm: 50110 looks like a typo (50?)
pd.set_option('display.width', 1000111)       # TODO confirm: 1000111 looks like a typo (1000?)
pd.set_option('display.max_colwidth', 1000111)
Ford Gobike is a regional bike sharing public service in the Bay Area, CA, USA. In 2017, the service brought Ford Motor Company on-board to expand the scale of their operations. Currently, Ford Gobike serves the Bay Area, East Bay and San Jose. (Etherington, Darrell (June 27, 2017). "Ford GoBike launches in the Bay Area starting tomorrow". TechCrunch. Retrieved July 23, 2017.)
They have released their bike share data in the CSV format on their website. We will use their publicly available dataset for this project. This project aims to study the effects of weather on the quality and number of bike rides that took place in the Bay Area in the year of 2018.
Given our datasets, we want to describe them in detail to understand the patterns of weather and biking. Analyzing them will give us insights on how the data are structured and whether there have been any outliers due to events in the area. For instance, an unusually high number of bike rides might be observed due to a concert or a public event in that area.
By contrast, the recent California wildfires would have reduced the number of bikers.
Furthermore, the ability to predict the number of rides from the weather can allow the businesses that oversee demand and planning systems to manage them in a more efficient and cost-effective manner.
While doing the above, we will also attempt to understand the characteristics of our user base. We have a limited number of variables describing them, but we will examine the social and economic aspects of our user base to understand their biking patterns.
# Location of the project CSVs.
# NOTE(review): machine-specific absolute path; consider making it configurable.
filepath = "C:/Users/pujar/Dropbox/Semester 2/Python-BUDT758X/Pawject/data/"
# Column descriptions for the bike and weather datasets (notebook-style display).
desc = pd.read_csv(filepath+"desc_bike.csv",encoding='utf-8')
desc
desc = pd.read_csv(filepath+"desc_weather.csv",encoding='utf-8')
desc
# 2018 San Francisco daily weather observations.
wdf = pd.read_csv(filepath+"SF_Weather_2018.csv",encoding='utf-8')
# All 2018 Ford GoBike trips concatenated into one large CSV.
# FIX: low_memory=False (which the original assigned to an unused module-level
# variable) is applied here, where it was intended: it reads the file in one
# pass so mixed-type columns get a single consistent dtype instead of
# chunk-by-chunk guesses.
bdf = pd.read_csv(filepath+'all.csv', encoding='utf-8', sep=',', low_memory=False)
# Split the start/end timestamps into separate date and time-of-day columns.
bdf.insert(2, 'start_date', 0)
bdf.insert(3, 'end_date', 0)
bdf.insert(4, 'start_timing', 0)
bdf.insert(5, 'end_timing', 0)
# Work on string timestamps so vectorised regex extraction applies uniformly.
bdf['start_time'] = bdf['start_time'].astype(str)
bdf['end_time'] = bdf['end_time'].astype(str)
# The concatenated CSV repeats its header row every ~100,000 lines — likely an
# artifact of how the monthly files were glued together, not of pandas itself.
# Drop those rows BEFORE extracting (they would only yield NaN anyway).
bdf = bdf[bdf['start_time'] != 'start_time']
# FIX: raw strings avoid invalid '\d' escape warnings (SyntaxWarning on modern
# Python), and the dot before the fractional seconds is escaped so it matches
# a literal '.' rather than any character.
bdf['start_date'] = bdf['start_time'].str.extract(r'(\d{4}-\d{2}-\d{2})')
bdf['start_timing'] = bdf['start_time'].str.extract(r'(\d{2}:\d{2}:\d{2}\.\d{4})')
bdf['end_date'] = bdf['end_time'].str.extract(r'(\d{4}-\d{2}-\d{2})')
bdf['end_timing'] = bdf['end_time'].str.extract(r'(\d{2}:\d{2}:\d{2}\.\d{4})')
# Attach the weather observed on each ride's start date.
wdf['Date'] = pd.to_datetime(wdf['Date']).dt.strftime('%Y-%m-%d')
df = pd.merge(bdf, wdf, left_on='start_date', right_on='Date')
# Drop the snow columns from the weather frame.
# NOTE(review): this runs AFTER the merge, so df still carries both columns —
# presumably harmless since they are never used downstream; confirm intent.
wdf.drop(columns=['New Snow', 'Snow Depth'], inplace=True)
wdf.head()
Source: https://www.popsci.com/fires-california-air-quality-cigarettes
from itertools import cycle, islice
# Single bar colour used throughout the notebook's plots.
mycolors= "#6C5B7B"
# Month number ('01'..'12') pulled from the zero-padded start date.
# FIX: raw string avoids the invalid '\d' escape warning.
ser = df['start_date'].str.extract(r"(-\d{1,2})")
ser[0] = ser[0].str.replace('-','')
ser = ser[0].groupby(ser[0]).count()
# Bar chart of rides per month.  The shaded band covers the bars at positions
# 9-10 (months '10'-'11'); presumably it highlights the late-2018 wildfire
# period discussed in the text — TODO confirm the intended months.
plt.figure(figsize=(19,7), facecolor='white')
plt.title("Popular biking months")
plt.ylabel('Frequency of rides')
plt.xlabel('Month')
ser.plot.bar(x='lab', y='val', rot=0, color=mycolors)
plt.axvspan(8.5, 10.5, color='black', alpha=0.5)
plt.show()
# Rides per calendar date: index = date string, one column of counts,
# ordered chronologically.
df1 = (
    df.Date
      .value_counts()                                   # counts per date
      .to_frame()
      .rename(columns={'Date': 'Number of Bike rides'})
)
df1.index.names = ['Date']
df1 = df1.sort_index()
from datetime import datetime
import matplotlib.dates as mdate
import matplotlib as mpl
import datetime as dt
import matplotlib.pyplot as py
# Top panel of a two-row figure: daily ride counts over 2018.
fig = plt.figure(figsize=(18,12),facecolor='white')
ax1 = fig.add_subplot(2,1,1)
# Datetime index so matplotlib can place and format the x axis.
df1.index = pd.to_datetime(df1.index, format='%Y-%m-%d')
plt.title('Daily usage of bike rides - 2018, SF',fontsize=16)
plt.xlabel('Date',fontsize=12)
# FIX: the original installed a '%b %Y' formatter and immediately overwrote it
# with '%Y-%m'; only the effective (final) formatter is kept.
ax1.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m'))
# One labelled tick at the start of each month of 2018.
plt.xticks(pd.date_range('2018-01','2018-12',freq='MS'),rotation=30)
plt.ylabel("Number of bike rides",fontsize=12)
plt.ylim([0,8500])
plt.plot(df1.index,df1['Number of Bike rides'],label='Number of bike rides',c='blue',alpha=0.3)
# Bottom panel: daily high/low temperatures, added to the SAME figure as the
# ride-count panel above.
# FIX: the original opened a brand-new figure here, which left the top half of
# that new figure empty and orphaned the ride-count panel — contradicting the
# "Generate two plot" intent stated above.  Reuse the existing `fig`.
ax = fig.add_subplot(2,1,2)
wdf['Date'] = pd.to_datetime(wdf['Date'], format='%Y-%m-%d')
plt.plot(wdf['Date'],wdf['Maximum'],label='Maximum temperature',c='#F8B195',alpha=0.5)
plt.plot(wdf['Date'],wdf['Minimum'],label='Minimum temperature',c='#C06C84',alpha=0.5)
plt.title('Daily high and low temperatures - 2018, SF',fontsize=16)
plt.xlabel('Date',fontsize=16)
# Month-start ticks across 2018, formatted as dates.
plt.gca().xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d'))
plt.gca().xaxis.set_major_locator(mdate.MonthLocator())
plt.xticks(pd.date_range('2018-01-01','2018-12-31',freq="MS"),rotation=30)
plt.ylabel("Temperature ( F )",fontsize=12)
plt.ylim([20,100])
plt.tick_params(axis='both',which='major',labelsize=12)
plt.legend()
plt.show()
# Hour of day ('00'..'23') from the ride's start time.
# FIX: raw string avoids the invalid '\d' escape warning.
ser = df['start_timing'].astype('str')
ser = ser.str.extract(r"(\d{1,2})")
ser = ser[0].groupby(ser[0]).count()
# Ride frequency by hour of the day.
plt.figure(figsize=(19,7), facecolor='white')
plt.title("Popular biking hours")
plt.ylabel('Frequency of rides')
plt.xlabel('Hour of the day')
sns.pointplot(x=ser.index, y=ser.values, color='red')
plt.show()
# Ten most common origin stations.
print(df['start_station_name'].value_counts().sort_values(ascending=False).head(10))
import gmaps
import gmaps.datasets
# SECURITY NOTE(review): a Google Maps API key is hard-coded into source.
# It should be revoked and loaded from an environment variable or config
# file instead of being committed.
key = "AIzaSyAuS0BiPZ4uvjhE3ZHLpO7_ZFzVlW_THJ4"
# Use google maps api
gmaps.configure(api_key=key) # Fill in with your API key
# Sample 400,000 ride origins (sampling is without replacement, so this
# raises ValueError if df has fewer rows) and count rides per (lat, lon).
start_locations = df[['start_station_latitude', 'start_station_longitude']]
start_df=start_locations.sample(400000)
start_df1= start_df.groupby(by=['start_station_latitude','start_station_longitude'])
# groupby.size() yields the per-station ride count in column 0.
start_df2= pd.DataFrame(start_df1.size())
start_df2.rename(columns={0: 'Numbers'}, inplace=True)
start_df2=start_df2.reset_index()
# gmaps expects numeric (float) weights and coordinates.
start_df2['Numbers']=start_df2['Numbers'].astype('float')
start_df2['start_station_latitude']=start_df2['start_station_latitude'].astype('float')
start_df2['start_station_longitude']=start_df2['start_station_longitude'].astype('float')
# Heatmap of ride origins, weighted by ride count.
start_fig = gmaps.figure()
heatmap_layer = gmaps.heatmap_layer(
start_df2[['start_station_latitude','start_station_longitude']], weights=start_df2['Numbers'],
max_intensity=100, point_radius=5.0
)
start_fig.add_layer(heatmap_layer)
start_fig
# Ten most common destination stations.
print(df['end_station_name'].value_counts().sort_values(ascending=False).head(10))
# Sample 400,000 ride destinations and count rides ending at each (lat, lon).
coords = ['end_station_latitude', 'end_station_longitude']
end_df = df[coords].sample(400000)
end_df2 = (
    pd.DataFrame(end_df.groupby(by=coords).size())
      .rename(columns={0: 'Numbers'})
      .reset_index()
)
# gmaps expects numeric (float) weights and coordinates.
for col in ['Numbers'] + coords:
    end_df2[col] = end_df2[col].astype('float')
# Heatmap of ride destinations, weighted by ride count.
end_fig = gmaps.figure()
heatmap_layer = gmaps.heatmap_layer(
    end_df2[coords], weights=end_df2['Numbers'],
    max_intensity=100, point_radius=5.0
)
end_fig.add_layer(heatmap_layer)
end_fig
# Leftover literal 'member_gender' header strings in the column (see the
# repeated-header cleanup earlier) become 'Unknown'; NaNs are dropped.
df['member_gender'] = df['member_gender'].str.replace('member_gender','Unknown')
df_gender = (
    df['member_gender']
    .dropna()
    .value_counts()
    .sort_values(ascending=False)
)
# Bar chart of rider counts per gender category.
plt.figure(figsize=(19,7), facecolor='white')
plt.title("Gender Overview")
plt.ylabel('Number')
plt.xlabel('Gender')
ax = df_gender.plot.bar(x='lab', y='val', rot=0, color=mycolors)
plt.show()
import numpy as np
import datetime
# Rider age = current year - birth year.  Leftover 'member_birth_year' header
# strings and missing values are excluded before the numeric cast.
now = datetime.datetime.now()
df_clean_age = df[df['member_birth_year'] != 'member_birth_year']['member_birth_year']
df_clean_age = df_clean_age.dropna()
ser_age = float(now.year) - df_clean_age.astype('float64')
# BUG FIX: the original box-plot filter was `ser_age.index < 100`, which
# compares the DataFrame *row labels* (keeping roughly the first 100 rows),
# not the ages.  Filter on the age values themselves, matching the bar-chart
# filter below.
ser_age_box = ser_age[ser_age < 100]
# Frequency of each age (< 100), in ascending age order.
ser_age_bar = ser_age.value_counts().sort_index(ascending=True)
ser_age_bar = ser_age_bar[ser_age_bar.index<100].to_frame().reset_index()
# Top: bar per age; bottom: box plot of the age distribution.
fig, ax = plt.subplots(2, 1, sharex=False, figsize=(19,7), facecolor='white')
plt.title("Frequency of Age")
plt.ylabel('Number of People')
plt.xlabel('Age')
ser_age_box.plot.box(vert=False, ax=ax[1])
ser_age_bar.plot.bar(y='member_birth_year', x='index', rot=0, color=mycolors, ax=ax[0], legend=False)
plt.show()
# Cast precipitation and ride duration to numeric for filtering and plotting.
for col in ('Precipitation', 'duration_sec'):
    df[col] = df[col].astype('float64')
# Keep only rides that started on a day with measurable rain.
df_rainy = df[df['Precipitation'] > 0]
# Scatter: rainfall amount vs. individual ride duration.
plt.figure(figsize=(15,5), facecolor='white')
plt.plot(df_rainy['Precipitation'], df_rainy['duration_sec'], 'g.')
plt.xlabel('Precipitation')
plt.ylabel('Duration (seconds)')
plt.show()
# Number of rides observed at each precipitation level.
num_ride_rainy=df_rainy['Precipitation'].value_counts()
num_ride_rainy
df2 = num_ride_rainy.to_frame()
# FIX: removed an unused `column = df2.columns[0]` binding (dead local).
df2['index'] = df2.index.tolist()
# Scatter: precipitation level vs. number of rides at that level.
plt.figure(figsize=(15,5), facecolor='white')
plt.plot(df2['index'],df2['Precipitation'],'g.')
plt.xlabel('Precipitation')
plt.ylabel('Numbers of bike rides')
plt.show()
# For each maximum-temperature value, the average number of rides per day
# among the days that reached that maximum: first count rides per
# (Maximum, start_date) pair, then average the counts within each Maximum.
per_day = (
    df[['start_date','Maximum']]
    .groupby(["Maximum", "start_date"])
    .size()
    .reset_index(name="usage")
)
df3 = (
    per_day.groupby(by='Maximum')['usage']
    .agg('mean')
    .sort_values(ascending=False)
)
df3.sort_index().head()
# Scatter of maximum temperature vs. average daily ride count.
fig = plt.figure(figsize=(10,6), facecolor='white')
plt.plot(df3,'y.',label='Average numbers of bike rides')
plt.title('Maximum teperature VS Average daily usage of bike ride')
plt.xlabel('Maximum temperature',fontsize=12)
plt.ylabel('Average numbers of bike rides',fontsize=12)
plt.legend()
plt.show()
# Heatmap 1: bike_share_for_all_trip share within each gender (row-normalised).
fig = plt.figure(figsize=(10,6), facecolor='white')
httable = (
    pd.crosstab(index=df.member_gender,
                columns=df.bike_share_for_all_trip,
                normalize='index')
    .loc[['Male','Female','Other']]
)
sns.heatmap(httable, cmap='YlGnBu', annot=True, square=False, linewidths=1)
plt.xlabel('');
# Heatmap 2: raw trip counts, user type vs. bike_share_for_all_trip.
fig = plt.figure(figsize=(10,6), facecolor='white')
httable = (
    pd.crosstab(index=df.user_type, columns=df.bike_share_for_all_trip)
    .loc[['Subscriber','Customer']]
)
sns.heatmap(httable, cmap='Oranges', annot=True, fmt='d', square=False, linewidths=1)
plt.xlabel('');
# Heatmap 3: user-type share within each gender (row-normalised).
fig = plt.figure(figsize=(10,6), facecolor='white')
httable = (
    pd.crosstab(index=df.member_gender, columns=df.user_type, normalize='index')
    .loc[['Male','Female','Other']]
)
sns.heatmap(httable, cmap='Purples', annot=True, square=False, linewidths=1)
def draw_pred_plot(y, y_true=None):
    """Plot predicted values against actual test targets.

    Parameters
    ----------
    y : array-like
        Predicted values (blue line).
    y_true : array-like, optional
        Actual values (red line).  Defaults to the module-level ``y_test``
        split, which the original implementation read implicitly as a
        global; the new parameter keeps existing call sites working while
        letting new callers pass the truth explicitly.
    """
    if y_true is None:
        # Backward-compatible fallback to the global train/test split.
        y_true = y_test
    fig = plt.figure(figsize=(12, 8), facecolor='white')
    plt.plot(y_true, color='red', label='Real data')
    plt.plot(y, color='blue', label='Predicted data')
    plt.title('Prediction')
    plt.legend()
    plt.show()
# Per-day mean of the weather variables, plus the day's ride count.
weather_cols = ['Average', 'Departure', 'HDD', 'CDD', 'Precipitation']
df0 = df[weather_cols + ['start_date']].groupby('start_date').mean()
# Ride count per day via count() of any always-present column.
# NOTE(review): this rebinds `df1`, clobbering the earlier daily-counts frame.
df1 = df[weather_cols + ['start_date']].groupby('start_date').count()['Average']
df0['count'] = df1
df0.head()
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
# Reproducible 80/20 split: daily weather features vs. daily ride counts.
np.random.seed(12345)
dfr = df0[['Average', 'Departure', 'HDD', 'CDD','Precipitation']]
X_train, X_test, y_train, y_test = train_test_split(
    dfr, df0['count'].values, test_size=0.2)
# Ordinary least-squares baseline.
ols = LinearRegression(fit_intercept=True)
ols.fit(X_train, y_train)
print('The out-of-sample R2 is', ols.score(X_test, y_test))
# Coefficient table (notebook-style display; the value is discarded when run
# as a script).
pd.DataFrame({'Feature': dfr.columns, 'Coefficient': ols.coef_},
             columns=['Feature','Coefficient'])
# Predicted-vs-actual plot and held-out metrics.
y = ols.predict(X_test)
draw_pred_plot(y)
r2lr = ols.score(X_test, y_test)
mselr = mean_squared_error(y, y_test)
print("R^2: ", r2lr)
print("MSE Linear Regression: ", mselr)
# ----- Decision tree ---------------------------------------------------------
from sklearn import tree
# Un-tuned tree as a baseline.
tr = tree.DecisionTreeRegressor()
tr.fit(X_train, y_train)
r2tr = tr.score(X_test, y_test)
print('The out-of-sample R2 is', r2tr)
# Tune split size and depth by 5-fold cross-validated grid search on R^2.
params = {'min_samples_split': [5,10,25,50], 'max_depth': [5,10,25]}
tree_grid = GridSearchCV(estimator=tr, param_grid=params, scoring='r2', cv=5, n_jobs=4)
tree_grid.fit(X_train, y_train)
# FIX: the original bound the best estimator to the name `tree`, shadowing the
# sklearn module imported above; renamed to avoid the collision.
best_tree = tree_grid.best_estimator_
y = best_tree.predict(X_test)
draw_pred_plot(y)
r2tr = best_tree.score(X_test, y_test)
msetr = mean_squared_error(y_test, y)
print("Tree R^2: ",r2tr)
print("MSE Tree: ", msetr)

# ----- Tree visualisation ----------------------------------------------------
# FIX: `sklearn.externals.six` was removed in scikit-learn 0.23; the standard
# library provides the same StringIO.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(best_tree, out_file=dot_data, filled=True, rounded=True,
                special_characters=True, feature_names=X_train.columns)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('Biking.png')
Image(graph.create_png())

# ----- AdaBoost on top of the tuned tree -------------------------------------
from sklearn import ensemble as en
from sklearn import metrics
dt5 = en.AdaBoostRegressor(random_state=1, base_estimator=best_tree)
regr = dt5.fit(X_train, y_train)
fig=plt.figure()
# BUG FIX: the original plotted and scored `tr.predict(X_test)` — the UN-TUNED
# baseline tree — so the value printed as "MSE Boosting" was not the boosted
# model's error.  Use the boosted model's predictions throughout.
y = regr.predict(X_test)
draw_pred_plot(y)
r2b = regr.score(X_test, y_test)
mseb = mean_squared_error(y_test, y)
print("Boosting R^2: ", r2b)
print("MSE Boosting:", mseb)
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go
# FIX: dropped `import plotly.plotly as py` (that module was removed from
# plotly — its contents moved to the separate chart_studio package — and the
# alias also shadowed `matplotlib.pyplot as py` from earlier) and the unused
# star import from plotly.graph_objs.
init_notebook_mode()
# Interactive bar chart of the boosted model's feature importances.
data = [go.Bar(
        x=X_test.columns.tolist(),
        y=regr.feature_importances_.tolist()
)]
iplot(data)
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
# Min-max scale the features (fit on the training set only, to avoid
# leakage into the test set) and squash the training target into [0, 1].
from sklearn.preprocessing import MinMaxScaler
min_max_sc = MinMaxScaler()
X_train1 = min_max_sc.fit_transform(X_train)
X_test1 = min_max_sc.transform(X_test)
y_train1=(y_train-y_train.min())/(y_train.max()-y_train.min())
np.random.seed(12345)
# Small feed-forward regressor: inputs -> 4 relu units -> 1 relu output.
nn = Sequential()
nn.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu', input_dim = dfr.shape[1]))
nn.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'relu'))
# FIX: dropped metrics=['accuracy'] — classification accuracy is meaningless
# for a continuous regression target and only produced a misleading log
# column; the MSE loss is the relevant training signal.
nn.compile(optimizer = 'adam', loss = 'mean_squared_error')
nn.fit(X_train1, y_train1, batch_size = 10, epochs = 100)
from ann_visualizer.visualize import ann_viz
# Render the network architecture to the 'neuralnetwork' file.
ann_viz(nn, view=True, filename="neuralnetwork", title="Neural Network")
# Predict on the scaled test features and undo the target's min-max scaling
# so predictions are comparable with the raw y_test.
y_pred = nn.predict(X_test1)
y_pred1= y_pred*(y_train.max()-y_train.min())+y_train.min()
# FIX: removed a stray bare `fig = plt.figure()` that opened an empty extra
# figure immediately before the real one.
fig = plt.figure(figsize=(12,8), facecolor='white')
plt.plot(y_test, color = 'red', label = 'Real data')
plt.plot(y_pred1, color = 'blue', label = 'Predicted data')
plt.title('Prediction')
plt.legend()
plt.show()
msenn = mean_squared_error(y_test, y_pred1)
print("MSE Neural Network:",msenn)
# Compare out-of-sample R^2 across models.
# NOTE(review): no R^2 was computed for the neural network above, hence its
# absence from this chart (it does appear in the MSE chart below).
r2json = {"Linear Regression":r2lr, "Decision Trees":r2tr, "Boosting":r2b}
fig = plt.figure(figsize=(12,8), facecolor='white')
plt.bar(list(r2json.keys()), list(r2json.values()))
plt.xlabel('----Method----')
plt.ylabel('R^2')
plt.show()
# Compare mean squared error across all four models.
msejson = {"Linear Regression":mselr, "Decision Trees":msetr, "Boosting":mseb, "Neural Network":msenn}
fig = plt.figure(figsize=(12,8), facecolor='white')
plt.bar(list(msejson.keys()), list(msejson.values()))
plt.xlabel('----Method----')
plt.ylabel('MSE')
plt.show()
Owing to the above results, it is clear that linear regression captures the relationship between independent and dependent variables best (out of the methods we compared). This leads us to believe that the relationship is more linear than non-linear.
The above predictive and explanatory models should help the management at Ford GoBikes plan their resources more effectively. Weather reports are typically released 7 days prior, and that should enable planners at the company to check where demand is highest.
Biking is generally more environmentally friendly as compared to using fossil-fuel-powered vehicles, and would be a blessing to ease traffic congestion in the city (as shown in the Google maps plug in above). Also, it is a good way to relieve stress for both adults and children.
To cater to more people, Ford GoBike could expand their network further in areas such as West San Francisco, where few trips currently originate or end. Parks such as Golden Gate Park and Sunset Reservoir Park would make West San Francisco a good location to deploy bikes.
Furthermore, they can create incentive programs for seniors, children and women to take up more biking as is evident from skewed demographic profiles observed above. As they plan on expanding further, they will find our predictive models useful in determining how many rides to expect.